In [8]:

    
import dask.array as da

Import Dask, specifically its array module (`dask.array`).

Dask is a scheduler that runs tasks across threads and processes, and ...



In [7]:

    
import numpy as np

# Plain NumPy baseline: the integers 0 through 24.
x = np.arange(0, 25)



In [3]:

    
# Bare expression as the cell's last line: shows the current value of x as the cell output.
x









    Out[3]:





array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24])



In [9]:

    
# Build a Dask array holding 0..24, split into chunks of 5 elements.
x = da.arange(25, chunks=(5,))

# Element-wise square of x.
y = x ** 2

# NOTE(review): this bare `y` is not the cell's last expression, so nothing is displayed for it.
y

# Last expression in the cell: render the task graph behind y.
y.visualize()









    Out[9]:



In [11]:

    
# Graph for the last element of sqrt(x); this uses whatever x is bound to before this cell runs.
# NOTE(review): not the cell's last expression, so its return value is not shown as cell output.
da.sqrt(x)[-1].visualize()

# Rebind x to a larger Dask array: 250 elements in chunks of 5.
x = da.arange(250, chunks=(5,))

# Last expression in the cell: render the task graph for the new x.
x.visualize()









    Out[11]:



In [13]:

    
# 15x15 Dask array of ones, tiled into 5x5 chunks.
x = da.ones((15, 15), chunks=(5, 5))

# Render the task graph for summing along axis 1.
x.sum(axis=1).visualize()









    Out[13]:



In [14]:

    
import dask.multiprocessing

# Evaluate y with the multiprocessing scheduler.
# compute() no longer accepts the `get=` keyword; the scheduler is chosen by
# name, and 'processes' selects the same multiprocessing scheduler.
y.compute(scheduler='processes')









    Out[14]:





array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100, 121, 144,
       169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576])



In [15]:

    
import dask.dataframe as dd



In [16]:

    
# Column names assigned to the CSV data, in file order.
cols = ['square_id', 'timestamp', 'country_code',
        'sms_in', 'sms_out', 'call_in', 'call_out', 'internet']

# Per-column dtypes handed to read_csv. Each key must match a name in `cols`
# exactly; a misspelled key (e.g. 'countrycode') would not apply to any column.
# NOTE(review): int columns cannot hold missing values — confirm 'country_code'
# is always populated in the data.
dtypes = {'square_id': int, 'timestamp': int, 'country_code': int,
          'sms_in': float, 'sms_out': float, 'call_in': float, 'call_out': float, 'internet': float}



In [17]:

    
# IPython help syntax: the trailing `?` displays the documentation for dd.read_csv.
# NOTE(review): development leftover — consider removing from the finished notebook.
df = dd.read_csv?



In [ ]:

    
# NOTE(review): this binds the read_csv function itself to df without calling it —
# looks like an unfinished cell (it has no execution count).
df = dd.read_csv

**Note:** the separator argument (`sep`) is missing — it is required for a space-delimited file!



In [24]:

    
# Load every CSV matching data/split/*.csv into one Dask dataframe,
# applying the column names and dtypes defined earlier in the notebook.
df_a = dd.read_csv(
    'data/split/*.csv',
    sep="\t",
    header=0,
    names=cols,
    dtype=dtypes,
)



In [28]:









    



dd.Scalar<size-ag..., dtype=int64>